Data

Packages Used

## Installing package into '/Users/zoearnaut-hull/Library/R/3.6/library'
## (as 'lib' is unspecified)
## Warning: package 'disk.frame' is not available (for R version 3.6.1)
## Warning in p_install(package, character.only = TRUE, ...):
## Warning in library(package, lib.loc = lib.loc, character.only = TRUE,
## logical.return = TRUE, : there is no package called 'disk.frame'
## Warning in p_load(fastverse, magrittr, here, skimr, dplyr, ggplot2, ggthemes, : Failed to install/load:
## disk.frame

Most Frequent Words in Data

Visualizing text data after cleaning and pre-processing

Trimming: reducing number of features

We currently have 25,050 variables (too many!)
Defining the threshold (e.g., 1 == 1%). Goal: eliminate words that appear in __% of records in the training data.

The list of our most frequent words

actual can differ found instal long much onli problem right site test want
address case doe free interest look must order process rpm softwar thank way
also chang doesn get internet lot name origin product run someth thing web
american check don give invest made nation packag program said spam think week
ani click email good issu mail need part provid say spamassassin time well
anoth code end got just make net peopl put secur sponsor today whi
anyon com error govern keep manag network perl rate see start trade will
back come even great know mani never person razor seem state tri window
base compani everi group last market new phone read send still two within
becaus comput exmh help life may next place real sent subject type without
befor countri file high like mean now pleas realli sep support unit work
best current find home line messag numbertnumber point receiv server sure unsubscrib world
better data first hyperlink link might offer post releas servic system use write
build date follow idea linux million old power remov set take user wrote
busi day fork includ list money onc price report show talk veri year
call develop form inform live month one probabl requir sinc technolog version

Preview data

Methods

# I got all of this code from stackoverflow by doing some simple googling for how to display a confusion matrix
## Source https://stackoverflow.com/questions/23891140/r-how-to-visualize-confusion-matrix-using-the-caret-package

#' Draw a two-class confusion matrix together with summary metrics
#'
#' Produces a two-panel base-graphics figure. The upper panel shows the four
#' cells of `cm$table` as coloured tiles (green on the diagonal, red off the
#' diagonal, white for a zero count; colour depth scales with the cell's share
#' of the total count). The lower panel prints elements 1, 2, 5, 6 and 7 of
#' `cm$byClass` and elements 1 and 2 of `cm$overall`, each rounded to three
#' decimals and labelled with its own name.
#'
#' @param cm A list-like object with components `table` (a 2 x 2 table of
#'   counts, read column-major with rows drawn as "Predicted" and columns as
#'   "Actual"), `byClass` and `overall` (named numeric vectors). Presumably
#'   the result of `caret::confusionMatrix()`; in that case the printed
#'   entries should be Sensitivity, Specificity, Precision, Recall, F1,
#'   Accuracy and Kappa -- confirm against the caret version in use.
#' @param class_labels Character vector of length 2 with the display names of
#'   the first and second class, in the same order as the rows/columns of
#'   `cm$table`. Defaults to the labels that were previously hard-coded.
#' @return `NULL`, invisibly. Called for its side effect of drawing a plot.
draw_confusion_matrix <- function(cm, class_labels = c("Non-Spam", "Spam")) {
  counts <- as.numeric(cm$table)
  # The tile layout below only makes sense for a binary problem; fail loudly
  # instead of silently drawing the first four cells of a larger table.
  if (length(counts) != 4L) {
    stop("draw_confusion_matrix() needs a 2 x 2 table in `cm$table`.",
         call. = FALSE)
  }
  if (length(class_labels) != 2L) {
    stop("`class_labels` must have exactly two elements.", call. = FALSE)
  }
  total <- sum(counts)

  # Generate color gradients. Palettes come from RColorBrewer.
  green_palette <- c("#F7FCF5", "#E5F5E0", "#C7E9C0", "#A1D99B", "#74C476",
                     "#41AB5D", "#238B45", "#006D2C", "#00441B")
  red_palette <- c("#FFF5F0", "#FEE0D2", "#FCBBA1", "#FC9272", "#FB6A4A",
                   "#EF3B2C", "#CB181D", "#A50F15", "#67000D")
  # Map a cell count to a fill colour: white for empty cells, otherwise a
  # shade from the upper 90% of a 100-step ramp, proportional to the share.
  get_color <- function(green_or_red = "green", amount = 0) {
    if (amount == 0) {
      return("#FFFFFF")
    }
    palette <- if (green_or_red == "red") red_palette else green_palette
    colorRampPalette(palette)(100)[10 + ceiling(90 * amount / total)]
  }

  # Save the full graphics state (margins and the layout-related settings)
  # and restore it on exit so later plots are not drawn into this layout.
  old_par <- par(no.readonly = TRUE)
  on.exit(par(old_par), add = TRUE)

  # set the basic layout: matrix panel takes two thirds, metrics one third
  layout(matrix(c(1, 1, 2)))
  par(mar = c(2, 2, 2, 2))
  plot(c(100, 345), c(300, 450), type = "n", xlab = "", ylab = "",
       xaxt = "n", yaxt = "n")
  title("Confusion Matrix", cex.main = 2)

  # create the matrix; counts are column-major, so counts[2] is
  # (predicted = class 2, actual = class 1) and counts[3] the reverse
  rect(150, 430, 240, 370, col = get_color("green", counts[1]))
  rect(250, 430, 340, 370, col = get_color("red", counts[3]))
  rect(150, 305, 240, 365, col = get_color("red", counts[2]))
  rect(250, 305, 340, 365, col = get_color("green", counts[4]))

  # axis titles and class labels
  text(125, 370, "Predicted", cex = 1.3, srt = 90, font = 2)
  text(245, 450, "Actual", cex = 1.3, font = 2)
  text(c(195, 295), 435, class_labels, cex = 1.2)
  text(140, c(400, 335), class_labels, cex = 1.2, srt = 90)

  # add in the cm results
  text(c(195, 195, 295, 295), c(400, 335, 400, 335), counts,
       cex = 1.6, font = 2, col = "black")

  # add in the specifics
  plot(c(100, 0), c(100, 0), type = "n", xlab = "", ylab = "",
       main = "Metrics", xaxt = "n", yaxt = "n")
  by_class_idx <- c(1, 2, 5, 6, 7)
  by_class_x <- c(10, 30, 50, 70, 90)
  text(by_class_x, 85, names(cm$byClass)[by_class_idx], cex = 1.2, font = 2)
  text(by_class_x, 70, round(as.numeric(cm$byClass[by_class_idx]), 3),
       cex = 1.2)

  # add in the accuracy information
  overall_x <- c(30, 70)
  text(overall_x, 35, names(cm$overall)[1:2], cex = 1.5, font = 2)
  text(overall_x, 20, round(as.numeric(cm$overall[1:2]), 3), cex = 1.4)

  invisible(NULL)
}

Lasso

## Warning: package 'rlang' was built under R version 3.6.2
## Warning: package 'vctrs' was built under R version 3.6.2
## Warning: package 'glmnet' was built under R version 3.6.2
## Warning: package 'Matrix' was built under R version 3.6.2

Lasso Results

Penalty Metric Estimator Mean n Standard Error .config
0.010 mae standard 0.283 5 0.002 Preprocessor1_Model002
0.031 rmse standard 0.376 5 0.004 Preprocessor1_Model009

Logistic Lasso

Logistic